#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2010 Marcel Martin <marcel.martin@tu-dortmund.de>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

"""%prog [options] <FASTA/FASTQ FILE> [<QUALITY FILE>]

Reads a FASTA or FASTQ file, finds and removes adapters,
and writes the changed sequence to standard output.
When finished, statistics are printed to standard error.

If two file names are given, they are assumed to be
.csfasta and .qual files as produced by the SOLiD sequencer.
(You still need to provide the -c option to correctly deal
with color space.)

If the name of any input or output file ends with '.gz', it is
assumed to be gzip-compressed.

If you want to search for the reverse complement of an adapter, you must
provide an additional adapter sequence using two '-a' parameters.

If the input sequences are in color space, the adapter must
also be provided in color space (using a string of digits 0123).

EXAMPLE

Assuming your sequencing data is available as a FASTQ file, use this
command line:
$ cutadapt -e ERROR-RATE -a ADAPTER-SEQUENCE input.fastq > output.fastq"""

from __future__ import print_function, division

__version__ = '0.8'

import sys
import re
import gzip
import time
from string import maketrans
from optparse import OptionParser
from itertools import izip
from contextlib import closing
from collections import defaultdict

import align
import fasta

# constants for the find_best_alignment function
BACK = align.START_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ2 | align.STOP_WITHIN_SEQ1
ANYWHERE = align.SEMIGLOBAL


class FormatError(Exception):
	pass


class Error(Exception):
	pass


class UnknownFileType(Exception):
	pass


class HelpfulOptionParser(OptionParser):
	"""An OptionParser that prints full help on errors."""
	def error(self, msg):
		self.print_help(sys.stderr)
		self.exit(2, "\n%s: error: %s\n" % (self.get_prog_name(), msg))


def xopen(filename, mode='r', is_closing=True):
	"""
	Replacement for the "open" function that can also open
	files that have been compressed with gzip. If the filename ends with .gz,
	the file is opened with gzip.open(). If it doesn't, the regular open()
	is used.
	closing -- whether to wrap the returned GzipFile with contextlib.closing
		to make it usable in 'with' statements. (TODO look for a nicer solution)
	"""
	if filename.endswith('.gz'):
		if is_closing:
			return closing(gzip.open(filename, mode))
		else:
			return gzip.open(filename, mode)
	else:
		return open(filename, mode)


def print_histogram(d):
	"""d -- a dictionary mapping values to their respective frequency"""
	print("length", "count", sep="\t")
	for key in sorted(d):
		print(key, d[key], sep="\t")
	print()
	
	
def print_statistics(adapters, stats, error_rate):
	print("cutadapt version", __version__)
	print("Command line parameters:", " ".join(sys.argv[1:]))
	print("Maximum error rate: %.2f%%" % (error_rate * 100.))
	print("   Processed reads:", stats.n)
	print("     Trimmed reads:", stats.reads_changed, "(%5.1f%%)" % (100. * stats.reads_changed / stats.n))
	print("   Too short reads:", stats.too_short, "(%5.1f%% of processed reads)" % (100. * stats.too_short / stats.n))
	print("        Total time: %9.2f s" % stats.time)
	print("     Time per read: %9.2f ms" % (1000. * stats.time / stats.n))
	print()
	for index, (where, adapter) in enumerate(adapters):
		total_front = sum(stats.lengths_front[index].values())
		total_back = sum(stats.lengths_back[index].values())
		total = total_front + total_back
		assert where == ANYWHERE or (where == BACK and total_front == 0)
		
		print("=" * 3, "Adapter", index+1, "=" * 3)
		print()
		print("Adapter '%s'," % adapter, "length %d," % len(adapter), "was trimmed", total, "times.")
		if where == ANYWHERE:
			print(total_front, "times, it overlapped the 5' end of a read")
			print(total_back, "times, it overlapped the 3' end or was within the read")
			print()
			print("Histogram of adapter lengths (5')")
			print_histogram(stats.lengths_front[index])
			print()
			print("Histogram of adapter lengths (3' or within)")
			print_histogram(stats.lengths_back[index])
		else:
			print()
			print("Histogram of adapter lengths")
			print_histogram(stats.lengths_back[index])


def quality_to_ascii(quality_line, base=33):
	"""
	Convert a string containing qualities given as integer to a string of 
	ASCII-encoded qualities.

	base -- ASCII code of quality zero (sensible values are 33 and 64).

	>>> quality_to_ascii("17 4 29 18")
	'2%>3'
	"""
	fields = map(int, quality_line.split())
	qualities = ''.join(chr(q+base) for q in fields)
	return qualities


def find_best_alignment(adapters, seq, max_error_rate, minimum_overlap):
	"""
	Find the best matching adapter.

	adapters -- List of adapter sequences
	seq -- The sequence to which each adapter will be aligned
	where -- Where in the sequence the adapter may be found. 
		One of BACK and FRONT_OR_BACK. For both, 
		the adapter will also be found if it is in the middle.
	max_error_rate -- Maximum allowed error rate. The error rate is
		the number of errors in the alignment divided by the length
		of the part of the alignment that matches the adapter.

	Return tuple (best_alignment, best_index).

	best_alignment is an alignment as returned by semiglobalalign.
	best_index is the index of the best adapter into the adapters list.
	"""
	best_score = 0
	best_alignment = None
	best_index = None
	for index, (where, adapter) in enumerate(adapters):
		# try to find an exact match first
		pos = seq.find(adapter)
		if pos >= 0:
			alignment = (None, None, 0, len(adapter), pos, pos + len(adapter), 0)
		else:
			alignment = align.globalalign(adapter, seq, where)
		(r1, r2, astart, astop, rstart, rstop, errors) = alignment
		length = astop - astart
		if length < minimum_overlap or errors/length > max_error_rate:
			continue

		# the length of the matching part of the adapter minus errors
		# determines which adapter fits best
		score = length - errors
		if score > best_score:
			best_alignment = alignment
			best_score = score
			best_index = index
	return (best_alignment, best_index)


def fastafiletype(fname):
	"""
	Determine file type of fname. Return the string FASTQ or FASTA or
	raise an UnknownFileType exception.
	"""
	with xopen(fname) as f:
		for line in f:
			if line.startswith('#'):
				continue
			if line.startswith('@'):
				return 'FASTQ'
			if line.startswith('>'):
				return 'FASTA'
			raise UnknownFileType("neither FASTQ nor FASTA")


def read_sequences(seqfilename, qualityfilename, colorspace):
	"""
	Read sequences and (if available) quality information from either:
	* seqfilename in FASTA format (qualityfilename must be None)
	* seqfilename in FASTQ format (qualityfilename must be None)
	* seqfilename in .csfasta format and qualityfilename in .qual format 
	  (SOLiD color space)

	Return a generator over tuples (description, sequence, qualities).
	qualities is None if no qualities are available.
	"""
	ftype = fastafiletype(seqfilename)

	if ftype == 'FASTQ' and qualityfilename is not None:
		raise Error("If a FASTQ file is given, no quality file can be provided.")

	if ftype == 'FASTQ':
		with xopen(seqfilename) as seqfile:
			for desc, seq, qualities in fasta.readfastq(seqfile, colorspace=colorspace):
				yield (desc, seq, qualities)
	elif ftype == 'FASTA' and qualityfilename is None:
		with xopen(seqfilename) as seqfile:
			for desc, seq in fasta.readfasta(seqfile):
				yield (desc, seq, None)
	else:
		assert ftype == 'FASTA' and qualityfilename is not None
		lengthdiff = 1 if colorspace else 0
		with xopen(seqfilename) as seqfile:
			with xopen(qualityfilename) as qualityfile:
				seq_iter = fasta.readfasta(seqfile)
				qual_iter = fasta.readfasta(qualityfile)
				for (qdesc, qseq), (rdesc, rseq) in izip(qual_iter, seq_iter):
					if qdesc != rdesc:
						raise FormatError("Descriptions in FASTA and quality file don't match (%s and %s)." % (rdesc, qdesc))
					qualities = quality_to_ascii(qseq)
					if len(rseq) == 0 and colorspace:
						raise FormatError("When reading '%s', no sequence was found (at least the initial primer must appear)." % rdesc)
					if len(qualities) != len(rseq) - lengthdiff:
						raise FormatError("While reading '%s': expected to find %d quality values, but found %d." % (rdesc, len(rseq) - lengthdiff, len(qualities)))
					yield rdesc, rseq, qualities


# TODO refactor this into an AdapterCutter class
def process_read(desc, seq, qualities, options, adapters, stats):
	# In colorspace, the first character is the last nucleotide of the primer base
	# and the second character encodes the transition from the primer base to the
	# first real base of the read.
	if options.trim_primer:
		seq = seq[2:]
		if qualities is not None:
			qualities = qualities[1:]
		initial = ''
	elif options.colorspace:
		initial = seq[0]
		seq = seq[1:]
	else:
		initial = ''

	any_adapter_matches = False

	# try (possibly more than once) to remove an adapter
	for t in xrange(options.times):
		alignment, index = find_best_alignment(adapters, seq, options.error_rate, options.overlap)
		if alignment is None:
			# nothing found
			break

		(r1, r2, astart, astop, rstart, rstop, errors) = alignment
		length = astop - astart
		assert length > 0
		assert errors/length <= options.error_rate
		assert length - errors > 0

		any_adapter_matches = True
		where, adapter = adapters[index]
		
		if where == BACK or (astart == 0 and rstart > 0):
			assert where == ANYWHERE or astart == 0
			# The adapter is at the end of the read (case 2) or in the middle (case 3)
			if rstop < len(seq):
				# The adapter is in the middle of the read (case 3)
				if options.rest_file is not None:
					print(seq[rstop:], file=options.rest_file)
			stats.lengths_back[index][length] += 1

			if options.colorspace:
				# trim one more color if long enough
				rstart = max(0, rstart - 1)
			seq = seq[:rstart]
			if qualities is not None:
				qualities = qualities[:rstart]
				assert len(qualities) == len(seq)
		elif where == ANYWHERE:
			# The adapter is in the beginning of the read (case 4)
			assert rstart == 0
			stats.lengths_front[index][length] += 1
			# TODO What should we do in color space?
			seq = seq[rstop:]
			if qualities is not None:
				qualities = qualities[rstop:]
		else:
			assert False

	if any_adapter_matches:
		stats.reads_changed += 1
	if options.discard and any_adapter_matches:
		return (None, None, None)
	if len(seq) < options.minimum_length:
		stats.too_short += 1
		return (None, None, None)

	# other modifications to the sequence or its description
	# change any length=XX that may appear in the read description (for 454)
	if desc.find("length=") >= 0:
		desc = lengthregex.sub("length=%d" % len(seq), desc)

	if options.strip_f3 and desc.endswith('_F3'):
		desc = desc[:-3]
	desc = options.prefix + desc + options.suffix
	if options.double_encode:
		# convert color space sequence to double-encoded colorspace (using
		# characters ACGTN to represent colors)
		seq = seq.translate(trans)

	if options.colorspace:
		assert not qualities or len(seq) == len(qualities)
	return (desc, initial + seq, qualities)


def main():
	parser = HelpfulOptionParser(usage=__doc__, version=__version__)
	parser.add_option("-e", "--error-rate", type=float, default=0.1,
		help="Maximum allowed error rate (no. of errors divided by the length of the matching region) (default: %default)")
	parser.add_option("-o", "--output", default=None,
		help="The modified sequence gets written to this file. The format is FASTQ if qualities are available, FASTA otherwise. (default: standard output)")
	parser.add_option("-a", "--adapter", action="append", dest="adapters",
		help="Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that follows is trimmed. If multiple -a or -b options are given, only the best matching adapter is trimmed.")
	parser.add_option("-b", "--anywhere", action="append", metavar="ADAPTER",
		help="Sequence of an adapter that was ligated to the 5' or 3' end. If the adapter is found within the read or overlapping the 3' end of the read, the behavior is the same as for the -a option. If the adapter overlaps the 5' end (beginning of the read), the initial portion of the read matching the adapter is trimmed, but anything that follows is kept. If multiple -a or -b options are given, only the best matching adapter is trimmed.")
	parser.add_option("--discard", action='store_true', default=False,
		help="Discard reads that contain the adapter instead of trimming them. Also use -O in order to avoid throwing away too many reads!")
	parser.add_option("-n", "--times", type=int, metavar="COUNT", default=1,
		help="Try to remove adapters at most COUNT times. Useful when an adapter gets appended multiple times.")
	parser.add_option("-O", "--overlap", type=int, metavar="LENGTH", default=3,
		help="Minimum overlap length. If the overlap between the adapter and the sequence is shorter than LENGTH, the read is not modified.")
	parser.add_option("-m", "--minimum-length", type=int, default=0, metavar="LENGTH",
		help="Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter removal are also discarded. In colorspace, an initial primer is not counted.")
	parser.add_option("-r", "--rest-file", default=None,
		help="When the adapter matches in the middle of a read, write the rest (after the adapter) into a file. Use - for standard output.")
	parser.add_option("-x", "--prefix", default='',
		help="Add this prefix to read names")
	parser.add_option("-y", "--suffix", default='',
		help="Add this suffix to read names")
	parser.add_option("-c", "--colorspace", action='store_true', default=False,
		help="Colorspace mode: Also trim the color that is adjacent to the found adapter.")
	parser.add_option("-d", "--double-encode", action='store_true', default=False,
		help="When in color space, double-encode colors (map 0,1,2,3,4 to A,C,G,T,N).")
	parser.add_option("-t", "--trim-primer", action='store_true', default=False,
		help="When in color space, trim primer base and the first color (which is the transition to the first nucleotide)")
	parser.add_option("--strip-f3", action='store_true', default=False,
		help="For color space: Strip the _F3 suffix of read names")
	parser.add_option("--maq", "--bwa", action='store_true', default=False,
		help="MAQ/BWA-compatible color space output. This enables -c, -d, -t, --strip-f3 and -y '/1'.")

	options, args = parser.parse_args()

	if len(args) == 0:
		parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
	elif len(args) > 2:
		parser.error("Too many parameters.")

	input_filename = args[0]
	quality_filename = None
	if len(args) == 2:
		quality_filename = args[1]
	if options.output is None:
		outfile = sys.stdout
	else:
		outfile = xopen(options.output, 'w', is_closing=False)

	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_f3 = True
		options.suffix = "/1"
	if options.trim_primer and not options.colorspace:
		parser.error("Trimming the primer makes only sense in color space.")
	if options.double_encode and not options.colorspace:
		parser.error("Double-encoding makes only sense in color space.")
	if options.anywhere and options.colorspace:
		parser.error("Using --anywhere with color space reads is currently not supported  (if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		parser.error("The maximum error rate must be between 0 and 1.")

	if options.rest_file is not None:
		if options.rest_file == '-':
			options.rest_file = sys.stdout
		else:
			options.rest_file = xopen(options.rest_file, 'w', is_closing=False)

	adapters = []
	if options.adapters:
		adapters = [ (BACK, adapter) for adapter in options.adapters ]
	if options.anywhere:
		adapters += [ (ANYWHERE, adapter) for adapter in options.anywhere ]
	del options.adapters
	del options.anywhere

	if not adapters:
		print("You need to provide at least one adapter sequence.", file=sys.stderr)
		return 1

	# TODO refactor this (put it into the AdapterCutter class
	class Statistics(object):
		pass
	stats = Statistics()
	stats.reads_changed = 0
	stats.too_short = 0
	stats.lengths_front = []
	for a in adapters:
		stats.lengths_front.append(defaultdict(int))
	stats.lengths_back = []
	for a in adapters:
		stats.lengths_back.append(defaultdict(int))

	# for double-encoding colors pace sequences
	global trans
	trans = maketrans('0123.', 'ACGTN')

	global lengthregex
	lengthregex = re.compile(r"\blength=[0-9]*\b")
	# length difference between sequence and qualities
	lengthdiff = 0 if options.trim_primer else 1

	# count the reads in which at least one adapter has been removed
	n = 0
	qualities = None
	start_time = time.clock()
	try:
		reader = read_sequences(input_filename, quality_filename, colorspace=options.colorspace)
		for (desc, seq, qualities) in reader:
			n += 1
			desc, seq, qualities = process_read(desc, seq, qualities, options, adapters, stats)
			if seq is not None:
				# write modified sequence in either FASTA or FASTQ format
				if qualities is None:
					# FASTA
					print('>%s\n%s' % (desc, seq), file=outfile)
				else:
					# FASTQ
					print('@%s\n%s\n+\n%s' % (desc, seq, qualities), file=outfile)
	except FormatError as e:
		print(e, file=sys.stderr)
		return 1

	stats.time = time.clock()
	stats.n = n
	sys.stdout = sys.stderr
	print_statistics(adapters, stats, options.error_rate)
	sys.stdout = sys.__stdout__
	return 0


if __name__ == '__main__':
	if len(sys.argv) > 1 and sys.argv[1] == '--profile':
		del sys.argv[1]
		import cProfile as profile
		profile.run('main()', 'cutadapt.prof')
	else:
		sys.exit(main())

"""
Interpreting the result of a semiglobal alignment between the adapter and the read sequences
============================================================================================

There are nine cases, which can be grouped into four categories.

A: character of the adapter sequence (r1)
R: character of the read sequence    (r2)
-: gap symbol

The middle of the alignment is always like this:
AAAA
RRRR

For the part of that is before the middle and for the part that is after the middle,
there are three possibilities each (making 3*3=9)

A  and -  and  'nothing'
-      R

Grouping by semantics of those alignments, we get the following three categories.


1. The adapter covers or parts of it cover the entire read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
AAAA
RRRR

b)
AAAA
RR--

c)
AAAA
--RR

1d)
AAAA
-RR-

2. The adapter is at the end of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
---AAA
RRRRRR

b)
---AAAA
RRRRR--

3. The adapter is in the middle of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

---AAA---
RRRRRRRRR

4. The adapter is in the beginning of the read
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

a)
AAAA----
RRRRRRRR

b)
AAAA----
--RRRRRR
"""
